\documentclass[10pt,letterpaper]{article}

\usepackage[margin=1in]{geometry}
\setlength{\itemsep}{2pt}
\setlength{\topsep}{2pt}
\setlength{\itemsep}{2pt}

\newcommand{\ilias}[1]{{\color{blue}{#1}}}
\usepackage{amsmath,amsfonts,graphpap,amscd,mathrsfs,graphicx,lscape}
\usepackage{epsfig,amssymb,amstext,xspace}
\usepackage{float}	
\usepackage{hyperref}
\usepackage{cleveref}
\usepackage[numbers]{natbib}
\usepackage{listings}
\usepackage{color}
\usepackage{longtable}

\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

\usepackage{color}              % Need the color package
\usepackage{epsfig}

\usepackage[]{color-edits}
%\usepackage[suppress]{color-edits}
\addauthor{vs}{green}
\addauthor{mo}{red}
\addauthor{kb}{blue}
\addauthor{gl}{brown}
\usepackage{amsthm}
\usepackage{nicefrac}

\usepackage{subcaption}

\title{Data Documentation}

\begin{document}
	\date{}
	\maketitle
	\begin{abstract}
		We provide documentation for the datasets used in the testing and benchmarking of CATE estimators. 
	\end{abstract}
	
	\section{Infant Health and Development Program (IHDP) Data}
	\subsection{Description}
	The Infant Health and Development Program was a randomized study designed to reduce the developmental and health problems of low birthweight premature infants. The study started in 1985 and provided the treatment group with home visits, high-quality child care, and enrollment at a child development center. The children's cognitive and motor skills were tested at age three, and an experimental evaluation showed these scores were significantly higher in the treatment group compared to the control group. 
	
		The study collected data on $\sim$80 pretreatment variables. The dataset we employ contains the following subset of the features:
	
\begin{center}
	\begin{longtable}{||c | p{0.15\linewidth} p{0.4\linewidth} l c c||}
		\hline
		 & Feature & Description & Type & \texttt{sim} & \texttt{example}\\ [0.5ex] 
		\hline\hline
		1 & \textbf{treat} & Treatment assignment & Binary & \checkmark
		 & \checkmark \\
		\hline 
		2 & \textbf{bw} & Birth weight in grams & Continuous & \checkmark & \checkmark \\
		\hline
		3 & \textbf{b.head} & Child's head circumference (cm) at birth & Continuous & \checkmark & \\
		\hline
		4 & \textbf{preterm} & Number of weeks infant preterm  & Continuous & \checkmark & \checkmark\\
		\hline
		5 & \textbf{birth.o} & Birth order  & Continuous & \checkmark & \checkmark\\
		\hline
		6 & \textbf{nnhealth} & Neonatal health index  & Continuous & \checkmark & \checkmark \\
		\hline
		7 & \textbf{momage} & Age of mother at child's birth & Continuous & \checkmark & \checkmark \\ 
		\hline
		8 & \textbf{sex} & Child's gender & Binary & \checkmark & \checkmark \\ 
		\hline
		9 & \textbf{twin} & Is child a twin & Binary & \checkmark &  \\ 
		\hline
		10 & \textbf{b.marr} & Was mother married at birth & Binary & \checkmark & \checkmark \\ 
		\hline
		11 & \textbf{mom.lths} & Mother's education: less than high school & Binary & \checkmark & \checkmark \\ 
		\hline
		12 & \textbf{mom.hs} & Mother's education: high school  & Binary & \checkmark & \checkmark\\ 
		\hline
		13 & \textbf{mom.scoll} & Mother's education: some college & Binary & \checkmark & \checkmark \\ 
		\hline
		14 & \textbf{cig} & Did mother smoke during pregnancy & Binary & \checkmark &  \\ 
		\hline
		15 & \textbf{first} & Is child first born & Binary & \checkmark &  \\ 
		\hline
		16 & \textbf{booze} & Did mother drink during pregnancy & Binary & \checkmark &  \\ 
		\hline
		17 & \textbf{drugs} & Did mother do drugs during pregnancy & Binary & \checkmark & \checkmark \\ 
		\hline
		18 & \textbf{work.dur} & Did mother work during pregnancy & Binary & \checkmark & \checkmark \\ 
		\hline
		19 & \textbf{prenatal} & Did mother receive prenatal care & Binary & \checkmark & \\ 
		\hline
		20 & \textbf{site1-site8} & Program site 1 through 8 & Binary & \checkmark & \checkmark \\ 
		\hline
		21 & \textbf{momwhite, black, hisp} & Is mother white, black or Hispanic & Binary & \checkmark & \checkmark\\ 
		\hline
		22 & \textbf{iqsb.36} & Child IQ score at 36 months & Continuous & & \checkmark\\ \hline
		23 & \textbf{ncdctt} & Participation in the study (100s of days) & Continuous & & \checkmark \\ \hline
		24 & \textbf{dose400} & Is the participation in the study \textgreater 400 days & Binary & & \checkmark\\ \hline
		25 & \textbf{parity} & \# of children the mother has given birth to & Continuous & & \checkmark\\ \hline
		26 & \textbf{moreprem} & \# of other children mom has given birth to prematurely & Continuous & & \checkmark\\ \hline
		27 & \textbf{cigs} & \# cigarettes mother consumes & Continuous & & \checkmark \\ \hline
		28 & \textbf{alcohol} & Units of alcohol mother consumes & Continuous & & \checkmark \\ \hline
		29 & \textbf{ppvt.imp} & Peabody Picture Vocabulary Test for mom one year into program. Missing values are imputed & Continuous & & \checkmark\\ \hline
		30 & \textbf{bwg} & Birth weight group & Binary & & \checkmark \\ \hline
		31 & \textbf{mlt.birt} & Number of multiple births mother has had & Continuous & & \checkmark \\ \hline
		32 & \textbf{livwho} & Family member the child primarily lives with & Categorical & & \checkmark\\ \hline
		33 & \textbf{language} & Primary language spoken at home & Categorical & & \checkmark \\ \hline
		34 & \textbf{whenpren} & Trimester when mother's prenatal care began & Continuous & & \checkmark\\ \hline
		35 & \textbf{otherstudy} & Is participant enrolled in other study & Binary & & \checkmark\\ \hline
		\caption{Overview of covariates selected in two available IHDP datasets (\texttt{sim.csv}, \texttt{example.csv}).}
\end{longtable}
\end{center}
\subsection{Usage}
	
	The dataset was first made available by Jennifer L. Hill, who digitized the 1985 IHDP data in order to study the performance of CATE estimators on real-world data~\citep{hill2011bayesian}. We have three datasets available:
	\begin{itemize}
		\item \texttt{example\_full.csv}: The complete, unprocessed IHDP dataset. 
		
		This dataset contains all digitized data for the 985 participants in the study. It is unprocessed and the meaning of the different covariates is not well documented. 
		
		\item \texttt{sim.csv}: Curated dataset for simulating outcomes in the semi-synthetic data setting.
		
		This dataset is derived from the full dataset by selecting only a subset of the \textit{pre-study} covariates. While the treatment assignment has been left intact, the true outcome has been removed. This data should be used together with simulated outcomes. 

		\item \texttt{example.csv}: Curated dataset for calculating treatment effects with real outcomes in the observational data setting.
		
		This dataset is derived from the full dataset and contains a subset of the \textit{pre-treatment} covariates (since treatment started one year after the study, there are more variables available in this dataset). The treatment assignment, the true participation in the study (in days $\rightarrow$ a continuous treatment) and the outcome (IQ at age 3) are available.
	\end{itemize}
	These datasets have been used in the literature to compare treatment effect estimators. We now describe their intended usages.
	\subsubsection{Semi-synthetic data setting with generated outcomes and bias insertion}
	In this scenario, the covariates and treatment assignment are used, but a subset of the treatment group (children with non-white mothers) is removed, thus introducing the type of bias one would expect with observational studies. Based on the goal of the CATE estimator comparison, the outcomes are generated as follows:
	\begin{itemize}
		\item Constant treatment effect
		\begin{align*}
		Y(0) & = X\beta_A + \mathcal{N}(0, 1)\\
		Y(1) & = X\beta_A + 4 + \mathcal{N}(0, 1)
		\end{align*}
	where the coefficients of vector $\beta_A$ are sampled from $(0,1,2,3,4)$ with probabilities $(0.5, 0.2, 0.15, 0.1, 0.05)$. It is assumed that $X$ has been standardized prior to the analysis. This is known as ``Response Surface A'' in the literature. 
		\item Heterogeneous treatment effect 
		\begin{align*}
		Y(0) & = \exp\left((X+W)\beta_B\right) + \mathcal{N}(0, 1)\\
		Y(1) & = X\beta_B - \omega_B^S + \mathcal{N}(0, 1)
		\end{align*}
	where $W$ is a constant matrix with the same shape as $X$ and entries equal to $0.5$, the coefficients of vector $\beta_B$ are sampled from $(0,0.1,0.2,0.3,0.4)$ with probabilities $(0.6, 0.1, 0.1, 0.1, 0.1)$, and $\omega_B^S$ is calculated for every simulation such that the CATT equals $4$. This is known as ``Response Surface B'' in the literature. 
	\end{itemize}
	The outcomes are generated for $N=1000$ simulations and the PEHE (precision in estimating heterogeneous treatment effects) is calculated.
	\subsubsection{Observational data setting with binary treatment}
	In this scenario, we consider the true participation level in the study (in days) to be the treatment. Since participants in the treated cohort self-selected into different participation levels, this setting is essentially an observational one.
	
	The participation level varies from $0$ to $468$ days with a median of $372$ days. This variable can be discretized into ``low'' (\textless 400 days) and ``high'' (\textgreater 400 days). The participants with ``low'' participation are removed from the dataset, thus allowing us to compare the control group ($0$ participation) with the treated group (``high'' participation), essentially creating a binary treatment variable. Since there is no ground truth for the treatment effect in this case, the author of~\cite{hill2011bayesian} suggests using balance statistics to compare estimators.
	\subsubsection{Observational data setting with continuous treatment}
	This scenario is similar to the one above, except that the participation level is not discretized and instead we consider it to be a continuous treatment variable. 
	
	
	\bibliographystyle{plainnat}
	\bibliography{data_doc}
	  
	
\end{document}
	
	